\(~\)
library(lubridate)
library(tidyverse)
library(gridExtra)
library(grid)
library(kableExtra)
library(dplyr)
library(dummies)
library(car)
library(glmnet)
library(leaps)
library(caret)
library(pls)
library(MASS)
library(randomForest)
library(tree)
library(gbm)
library(leaps)
library(glmnet)
library(rpart)
library(rpart.plot)
library(broom)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(e1071)
library(vip)
\(~\)
data.read <- read.csv("https://www.skra.is/library/Samnyttar-skrar-/Fyrirtaeki-stofnanir/Fasteignamat-2019/gagnasafn_ib_2018.csv", header=T, sep=";", dec=",", fileEncoding="latin1")
\(~\)
data <- subset(data.read, select=c(rfastnum, kdagur, nuvirdi, teg_eign, svfn, byggar, efstah,
fjmib,lyfta, ibm2, fjhaed, fjbilsk, fjbkar, fjsturt,
fjklos, fjeld, fjherb, fjstof, fjgeym, stig10, bilskurm2,
svalm2, geymm2, matssvaedi, undirmatssvaedi, ibteg))
The variables that were not mentioned in the project description were discarded. So were variables that were very similar to other variables.
\(~\)
str(data)
## 'data.frame': 49777 obs. of 26 variables:
## $ rfastnum : int 10803303 10403734 10662477 10953287 10872101 10003894 10618062 10362378 10149692 10423071 ...
## $ kdagur : Factor w/ 1953 levels "01.01.2013","01.01.2014",..: 1795 139 1722 1801 255 1689 325 768 1618 132 ...
## $ nuvirdi : int 16400 27871 34592 21205 24243 21198 47331 11550 12411 25175 ...
## $ teg_eign : Factor w/ 12 levels "Einbýlishús",..: 6 6 6 1 6 6 1 6 1 6 ...
## $ svfn : int 0 0 1300 3000 1400 5200 0 0 2000 1000 ...
## $ byggar : Factor w/ 149 levels "(null)","1787",..: 104 110 135 96 122 90 127 75 65 119 ...
## $ efstah : int 4 10 3 1 3 3 2 2 2 3 ...
## $ fjmib : int 1 1 1 1 1 1 1 1 1 1 ...
## $ lyfta : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ibm2 : num 57.7 72.8 116.8 106.8 96 ...
## $ fjhaed : int 1 1 1 1 1 1 3 1 3 1 ...
## $ fjbilsk : int 0 1 0 0 0 0 0 0 0 0 ...
## $ fjbkar : int 1 1 1 0 1 1 1 0 1 1 ...
## $ fjsturt : int 0 0 1 1 0 0 1 1 0 0 ...
## $ fjklos : int 1 1 1 1 1 1 2 1 1 1 ...
## $ fjeld : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fjherb : int 1 2 3 2 3 2 5 1 5 2 ...
## $ fjstof : int 1 1 1 1 1 1 2 1 1 1 ...
## $ fjgeym : int 1 1 1 0 1 1 4 1 0 0 ...
## $ stig10 : num 10 10 10 10 10 10 9.9 10 10 10 ...
## $ bilskurm2 : Factor w/ 642 levels "-70","(null)",..: 3 3 3 171 3 3 564 3 123 134 ...
## $ svalm2 : Factor w/ 723 levels "-0,2","(null)",..: 3 617 664 3 3 3 3 3 3 549 ...
## $ geymm2 : Factor w/ 769 levels "(null)","0","0,1",..: 55 2 2 2 27 2 2 14 2 656 ...
## $ matssvaedi : int 161 72 700 3000 620 5110 290 100 2050 330 ...
## $ undirmatssvaedi: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ibteg : int 12 12 12 11 12 11 11 12 11 12 ...
A few variables were of incorrect type. Those were changed.
\(~\)
#Numerical to categorical
data <- mutate(data, rfastnum=factor(rfastnum), svfn=factor(svfn), efstah=factor(efstah),
lyfta=factor(lyfta), stig10=factor(stig10), matssvaedi=factor(matssvaedi),
undirmatssvaedi=factor(undirmatssvaedi), ibteg=factor(ibteg), svfn=factor(svfn))
#Categorical to numerical
data$bilskurm2 <- as.numeric(data$bilskurm2)
data$svalm2 <- as.numeric(data$svalm2)
data$geymm2 <- as.numeric(data$geymm2)
#Factor to date
data$kdagur <- as.Date(data$kdagur, "%d.%m.%Y")
#Fix variable byggar
data$byggar <- as.Date(data$byggar, "%Y"); data$byggar <- lubridate::year(data$byggar)
#Price in million ISK
data$nuvirdi_m <- data$nuvirdi/1000
str(data)
## 'data.frame': 49777 obs. of 27 variables:
## $ rfastnum : Factor w/ 41823 levels "10000034","10000075",..: 33627 16891 27721 39895 36586 165 25848 15171 6243 17659 ...
## $ kdagur : Date, format: "2015-01-29" "2015-02-03" ...
## $ nuvirdi : int 16400 27871 34592 21205 24243 21198 47331 11550 12411 25175 ...
## $ teg_eign : Factor w/ 12 levels "Einbýlishús",..: 6 6 6 1 6 6 1 6 1 6 ...
## $ svfn : Factor w/ 68 levels "0","1000","1100",..: 1 1 4 13 5 29 1 1 8 2 ...
## $ byggar : num 1973 1979 2004 1965 1991 ...
## $ efstah : Factor w/ 14 levels "0","1","2","3",..: 5 11 4 2 4 4 3 3 3 4 ...
## $ fjmib : int 1 1 1 1 1 1 1 1 1 1 ...
## $ lyfta : Factor w/ 8 levels "0","1","2","3",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ ibm2 : num 57.7 72.8 116.8 106.8 96 ...
## $ fjhaed : int 1 1 1 1 1 1 3 1 3 1 ...
## $ fjbilsk : int 0 1 0 0 0 0 0 0 0 0 ...
## $ fjbkar : int 1 1 1 0 1 1 1 0 1 1 ...
## $ fjsturt : int 0 0 1 1 0 0 1 1 0 0 ...
## $ fjklos : int 1 1 1 1 1 1 2 1 1 1 ...
## $ fjeld : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fjherb : int 1 2 3 2 3 2 5 1 5 2 ...
## $ fjstof : int 1 1 1 1 1 1 2 1 1 1 ...
## $ fjgeym : int 1 1 1 0 1 1 4 1 0 0 ...
## $ stig10 : Factor w/ 28 levels "4","5.5","6.3",..: 28 28 28 28 28 28 27 28 28 28 ...
## $ bilskurm2 : num 3 3 3 171 3 3 564 3 123 134 ...
## $ svalm2 : num 3 617 664 3 3 3 3 3 3 549 ...
## $ geymm2 : num 55 2 2 2 27 2 2 14 2 656 ...
## $ matssvaedi : Factor w/ 167 levels "11","20","25",..: 19 6 55 71 49 108 32 12 67 35 ...
## $ undirmatssvaedi: Factor w/ 73 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ibteg : Factor w/ 2 levels "11","12": 2 2 2 1 2 1 1 2 1 2 ...
## $ nuvirdi_m : num 16.4 27.9 34.6 21.2 24.2 ...
All variables now seemed to be of the correct type. Next we wanted to discard some useless observations and combine some categories of some of the categorical variables.
\(~\)
nrow(data)
## [1] 49777
data <- na.omit(data)
nrow(data)
## [1] 49760
A total of 17 rows had missing values and these were removed. All seemed to involve the variable “byggar”.
\(~\)
nrow(data)
## [1] 49760
capital <- c(0, 1000, 1100, 1300, 1400, 1604) #Numbers for capital area
data <- filter(data, svfn%in%capital)
data <- mutate(data, svfn = fct_recode(svfn, Reykjavík="0", Kópavogur="1000",
Seltjarnarnes="1100", Garðabær="1300",
Hafnarfjörður="1400", Mosfellsbær="1604"))
nrow(data)
## [1] 35116
outsidecapital <- c(999) #Number for Reykjavik rural area (single property)
data <- filter(data, matssvaedi != outsidecapital)
nrow(data)
## [1] 35115
We have in this step removed 14,645 observations that don´t belong to Reykjavík capital and now approximately 35 thousand observations remain.
\(~\)
There are many types of properties (teg_eign categorical variable has 12 levels). Are all these categories necessary?
summary(data$teg_eign)
## Einbýlishús Fjölbýlishús Gistihús Herbergi Hótelstarfsemi
## 3176 19 0 1 3
## Íbúðareign Íbúðarhús Ósamþykkt íbúð Parhús Raðhús
## 28766 47 66 780 2253
## Séreign Vinnustofa
## 2 2
We see that several categories have no or very few observations (<5).
We remove the following categories with few (<5) observations: guest house, room, hotel, private property, working area. (Gistihús, Herbergi, Hótelstarfsemi, Séreign, Vinnustofa)
teg.eign.in <- c("Einbýlishús", "Fjölbýlishús", "Íbúðareign", "Íbúðarhús", "Ósamþykkt íbúð", "Parhús", "Raðhús")
data <- filter(data, teg_eign%in%teg.eign.in)
data <- droplevels(data) # drop unused levels in the dataset
nrow(data)
## [1] 35107
After this cleaning we have removed 8 properties and there are 35.107 observations left.
\(~\)
levels(data$teg_eign)
## [1] "Einbýlishús" "Fjölbýlishús" "Íbúðareign" "Íbúðarhús"
## [5] "Ósamþykkt íbúð" "Parhús" "Raðhús"
data <- mutate(data, teg_eign = fct_recode(teg_eign, "single-family property"="Einbýlishús",
"apartment building"="Fjölbýlishús",
"apartment"="Íbúðareign",
"apartment building"="Íbúðarhús",
"illegal apartment"="Ósamþykkt íbúð",
"two-family building"="Parhús",
"town-house"="Raðhús"))
levels(data$teg_eign)
## [1] "single-family property" "apartment building"
## [3] "apartment" "illegal apartment"
## [5] "two-family building" "town-house"
\(~\)
data$kdagur <- format(data$kdagur,'%Y') #Take away dates and only leave years
data$byggar_cat <- cut(data$byggar, seq(1840,2020, 5)) #Cut year built every five years
data$matssvaedi_samein <- as.factor(data$matssvaedi)
data$matssvaedi_samein <- recode(data$matssvaedi_samein, "c('600','620','630','640','650','660','670','680')='Hafnarfjörður';
c('500','510','511','520','530','540')='GarðabærNorður';
c('550','560')='GarðabærSuður';
c('300','320','330','340','350','351')='Kópavogur'; c('800','810','820','840','290')='MosfellsbærVest/Kjalarnes';
c('850')='MosfellsbærHelgafell';
c('110','120','130','140','180','181')='Grafarvogur/Grafarholt';
c('700')='Álftanes';
c('400')='Seltjarnarnes';
c('20','25','31')='MiðbærRvk';
c('11','70','72','75')='VesturbærRvk';
c('80','85','90','91','100')='AusturRvk';
c('280','281','282','283','284')='AustAusturRvk';
c('150','160','161','170')='Breiðholt';
c('200','210','220','270')='Árbær'")
All dates for each year bought (kdagur) were combined - Or rather, dates and months were removed from kdagur so only the year remained.
Also, new variable was made for the variable year built (byggar) by combining all observations every five years.
Finally, some of the categories in the variable area (matssvaedi) were renamed and combined.
At last combine some categories for undirmatssvaedi
# group undirmatssvaedi
seeside <- c('105','106','18','49','8','53','32','34','60','33','50','10','6','54','11','61','104','47','58','9','45','51','52','57')
data$undirmatssvaedi_cat <- ifelse(data$undirmatssvaedi %in% seeside, "1", "0")
table(data$undirmatssvaedi_cat)
##
## 0 1
## 33649 1458
\(~\)
data <- mutate(data, Price_m2 = nuvirdi/ibm2)
summary(data$Price_m2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.79 285.79 344.64 361.66 421.64 8652.91
ggplot(data = data, aes(y=Price_m2)) + geom_boxplot() + theme_classic()
The median price/m2 is around 345 thousand ISK/m2.
There is one outlier with price/m2 = 8652.91. Look more closely at the outlier.
data %>% filter(Price_m2>2000)
## rfastnum kdagur nuvirdi teg_eign svfn byggar efstah fjmib lyfta
## 1 10543921 2016 956146 apartment Mosfellsbær 2016 3 1 1
## ibm2 fjhaed fjbilsk fjbkar fjsturt fjklos fjeld fjherb fjstof fjgeym
## 1 110.5 1 1 0 1 1 1 2 1 2
## stig10 bilskurm2 svalm2 geymm2 matssvaedi undirmatssvaedi ibteg
## 1 10 3 157 401 850 0 12
## nuvirdi_m byggar_cat matssvaedi_samein undirmatssvaedi_cat Price_m2
## 1 956.146 (2015,2020] MosfellsbærHelgafell 0 8652.905
data <- filter(data, Price_m2<2000)
nrow(data)
## [1] 35106
The outlier is a 2-room apartment in Mosfellsbær with a price of 956 million ISK - very unlikely. We remove this single observation.
#Look at the distribution of price/m2 again
summary(data$Price_m2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.79 285.79 344.64 361.42 421.63 1160.47
ggplot(data = data, aes(y=Price_m2)) + geom_boxplot() + theme_classic()
cleanup = theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(color = "black")) #Make cleanup-code
ggplot(data = data, aes(Price_m2, fill=teg_eign)) + geom_histogram(binwidth = 10) + theme_classic() + cleanup + scale_y_continuous(expand=c(0,0), limits=c(0,1750))
ggplot(data, aes(x=kdagur, y=nuvirdi_m)) +
geom_boxplot(fill='#A4A4A4', color="black") +
theme_classic() + ggtitle("Price with outliers") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p1
ggplot(data, aes(x=kdagur, y=nuvirdi_m)) +
geom_boxplot(outlier.shape=NA, fill='#A4A4A4', color="black") +
theme_classic() + scale_y_continuous(limits = c(0, 100)) +
ggtitle("Price without outliers") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p2
grid.arrange(p1, p2, ncol = 2)
ggplot(data, aes(x=kdagur, y=Price_m2)) +
geom_boxplot(fill='#A4A4A4', color="black") +
theme_classic() + ggtitle("Price/m2 with outliers") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p1
ggplot(data, aes(x=kdagur, y=Price_m2)) +
geom_boxplot(outlier.shape=NA, fill='#A4A4A4', color="black") +
theme_classic() + scale_y_continuous(limits = c(0, 1000)) +
ggtitle("Price/m2 without outliers") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p2
grid.arrange(p1, p2, ncol = 2)
We see that real estate prices have been steadily increasing in the last few years.
\(~\)
ggplot(data = data, aes(byggar, fill=teg_eign)) + geom_histogram(binwidth = 2) + theme_classic() +
labs(x="Year built", y="Number of properties", fill="Type of property") + scale_y_continuous(expand=c(0,0), limits=c(0,2000))
ggplot(data, aes(x=byggar_cat, y=Price_m2)) +
geom_boxplot(fill='#A4A4A4', color="black") +
theme_classic() + ggtitle("Price/m2 and year of construction") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Year of construction") + ylab("Sale price (thousand ISK)") +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
Thus the oldest (built before 1940) and newest (built after 2010) properties are most expensive. The brand new properties built after 2015 have the highest price.
The highest price/m2 are for the years 2016-2018, and also for a very few old houses built before or around 1900. Thus the oldest (built before 1940) and newest (built after 2010) properties are most expensive.
The years on top regarding number of constructions (numbers and price listed) were 2006 and 2007 and 2004, just before the economic “crash” (Hrunið) in 2008.
The year 2009 just after the crash shows a very sharp downturn in the construction activity - more than 7-fold decrease. However, the price stayed similar. The years 2009 and 2010 were also slow, but then the recovery started in 2013 and really exploded in 2014 followed by some decrease in 2015.
\(~\)
cleanup_point =
theme(panel.background = element_blank(), axis.line = element_line(color = "black"), panel.grid.major = element_blank(), legend.key=element_blank()) #Cleanup for scatterplot
ggplot(data, aes(x=ibm2, y=nuvirdi, color=teg_eign)) + geom_point() +
labs(x="Property size", y="Sale price", color="Type of property") +
cleanup_point -> p1
ggplot(data, aes(x=ibm2, y=Price_m2, color=teg_eign)) + geom_point() +
labs(x="Property size", y="Price/m2", color="Type of property") +
cleanup_point -> p2
grid.arrange(p1, p2, ncol = 2)
We see that price generally increases with property size but price/m2 tends to decrease.
\(~\)
ggplot(data, aes(matssvaedi, fill=teg_eign)) + geom_bar() + theme_classic() +
xlab("Area") + ylab("Number of properties") + labs(fill="Type of property") + scale_y_continuous(expand=c(0,0), limits=c(0,2500)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
\(~\)
ggplot(data, aes(matssvaedi_samein, fill=teg_eign)) + geom_bar() + theme_classic() +
xlab("Area") + ylab("Number of properties") + labs(fill="Type of property") +
scale_y_continuous(expand=c(0,0), limits=c(0,6500)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
\(~\)
na.omit(data) %>%
group_by(data$matssvaedi) %>%
summarize(Count=n(),
Meðalfermetraverð = round(mean(Price_m2),1),
Staðalfrávik = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(Meðalfermetraverð))
## # A tibble: 60 x 5
## `data$matssvaedi` Count Meðalfermetraverð Staðalfrávik Percent
## <fct> <int> <dbl> <dbl> <dbl>
## 1 31 89 497. 164. 0
## 2 550 253 468. 71.2 1
## 3 20 1663 454. 134. 5
## 4 25 536 451. 124 2
## 5 75 111 435 123. 0
## 6 11 351 429. 116 1
## 7 560 58 422. 91.6 0
## 8 90 1714 421. 116. 5
## 9 70 496 418. 110 1
## 10 400 645 414. 109. 2
## # … with 50 more rows
We see that are nr. 31 has by far the highest price/m2 (Miðbær:Suður-Þingholt). Second comes nr. 550 (Garðabær:Urriðaholt) and then nr. 20 (Miðbær:Frá Tjörn að Snorrabraut).
\(~\)
ggplot(data, aes(x=matssvaedi_samein, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="Area", y="Price per m^2 (thousand ISK)") + ylim(0,1000) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
\(~\)
ggplot(data, aes(x=matssvaedi_samein, y=byggar)) +
geom_boxplot(fill='#A4A4A4', color="black") +
theme_classic() + ggtitle("Location vs. year of construction") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Location (matssvaedi)") + ylab("Year of construction")
Some properties with really high prices in RvkMiðbær - where are these located exactly (sub-areas/undirmatssvaedi)?
data.s <- data[rev(order(data$Price_m2, na.last = FALSE)), ]
data.s[1:20,]
## rfastnum kdagur nuvirdi teg_eign svfn byggar
## 28415 10805575 2017 24486 apartment Reykjavík 1947
## 4703 10547338 2012 65519 apartment Reykjavík 2014
## 19089 10960906 2015 142541 apartment Reykjavík 2015
## 19090 10158390 2015 168311 apartment Reykjavík 2015
## 31419 10553943 2017 251019 single-family property Reykjavík 1954
## 34048 10805028 2017 23000 apartment Reykjavík 1958
## 27218 10062144 2016 47000 two-family building Reykjavík 1926
## 9314 10238187 2013 129149 apartment Reykjavík 2008
## 34567 10236346 2018 44850 apartment Reykjavík 2004
## 27515 10846742 2016 56409 town-house Reykjavík 1948
## 32214 10506190 2017 165000 apartment Reykjavík 2008
## 34568 10052930 2018 45784 apartment Reykjavík 2004
## 24426 10633210 2016 170460 apartment Seltjarnarnes 2016
## 24818 10379194 2016 23344 apartment Reykjavík 1905
## 33308 10584240 2017 125710 apartment Reykjavík 2015
## 25776 10878324 2016 122524 single-family property Reykjavík 1927
## 29107 10356430 2017 32986 single-family property Hafnarfjörður 1900
## 10150 10327189 2014 122766 apartment Reykjavík 2015
## 33082 10067667 2017 192228 single-family property Reykjavík 1935
## 31784 10768751 2017 86469 apartment Reykjavík 1934
## efstah fjmib lyfta ibm2 fjhaed fjbilsk fjbkar fjsturt fjklos fjeld
## 28415 2 1 0 21.1 2 0 1 0 1 1
## 4703 7 1 3 58.8 1 1 0 1 1 1
## 19089 9 1 3 128.8 1 2 1 1 2 1
## 19090 9 1 3 165.0 1 2 1 1 2 1
## 31419 2 2 0 247.7 2 0 1 2 3 1
## 34048 4 1 1 22.7 1 0 0 1 1 1
## 27218 1 1 0 46.4 1 0 0 1 1 1
## 9314 9 1 3 128.8 1 3 1 2 2 1
## 34567 4 1 0 44.8 1 0 0 1 1 1
## 27515 1 1 0 56.6 1 0 0 1 1 1
## 32214 9 1 3 166.4 1 1 1 1 2 1
## 34568 4 1 0 46.2 1 0 0 1 1 1
## 24426 5 1 0 173.5 1 2 0 1 1 1
## 24818 2 1 0 23.8 1 0 0 1 1 1
## 33308 9 1 3 128.8 1 1 1 1 2 1
## 25776 2 2 0 127.4 1 0 1 0 1 1
## 29107 1 1 0 34.3 1 0 0 1 1 1
## 10150 9 1 3 128.8 1 2 1 1 2 1
## 33082 2 1 0 203.2 2 0 1 1 2 1
## 31784 2 1 0 91.6 1 0 1 0 1 1
## fjherb fjstof fjgeym stig10 bilskurm2 svalm2 geymm2 matssvaedi
## 28415 1 1 1 10 3 3 2 100
## 4703 1 1 1 10 3 549 655 20
## 19089 2 1 0 10 3 85 744 20
## 19090 2 1 0 10 3 59 713 20
## 31419 10 2 1 10 286 221 194 70
## 34048 0 1 0 7 3 382 2 90
## 27218 2 1 0 10 3 3 463 20
## 9314 2 1 0 10 3 85 661 20
## 34567 1 1 0 10 3 698 296 20
## 27515 1 1 0 10 3 3 2 20
## 32214 2 1 0 10 3 59 77 20
## 34568 1 1 0 10 3 3 294 20
## 24426 3 1 1 10 3 249 388 400
## 24818 0 1 0 10 3 3 56 20
## 33308 2 1 0 10 3 85 709 20
## 25776 2 3 0 10 3 3 2 25
## 29107 2 1 0 10 3 3 148 600
## 10150 2 1 0 10 3 85 661 20
## 33082 4 3 0 10 75 550 520 31
## 31784 4 2 1 10 436 613 437 31
## undirmatssvaedi ibteg nuvirdi_m byggar_cat matssvaedi_samein
## 28415 0 12 24.486 (1945,1950] AusturRvk
## 4703 47 12 65.519 (2010,2015] MiðbærRvk
## 19089 51 12 142.541 (2010,2015] MiðbærRvk
## 19090 51 12 168.311 (2010,2015] MiðbærRvk
## 31419 6 11 251.019 (1950,1955] VesturbærRvk
## 34048 0 12 23.000 (1955,1960] AusturRvk
## 27218 0 11 47.000 (1925,1930] MiðbærRvk
## 9314 51 12 129.149 (2005,2010] MiðbærRvk
## 34567 0 12 44.850 (2000,2005] MiðbærRvk
## 27515 27 11 56.409 (1945,1950] MiðbærRvk
## 32214 51 12 165.000 (2005,2010] MiðbærRvk
## 34568 0 12 45.784 (2000,2005] MiðbærRvk
## 24426 45 12 170.460 (2015,2020] Seltjarnarnes
## 24818 0 12 23.344 (1900,1905] MiðbærRvk
## 33308 51 12 125.710 (2010,2015] MiðbærRvk
## 25776 0 11 122.524 (1925,1930] MiðbærRvk
## 29107 0 11 32.986 (1895,1900] Hafnarfjörður
## 10150 51 12 122.766 (2010,2015] MiðbærRvk
## 33082 0 11 192.228 (1930,1935] MiðbærRvk
## 31784 0 12 86.469 (1930,1935] MiðbærRvk
## undirmatssvaedi_cat Price_m2
## 28415 0 1160.4739
## 4703 1 1114.2687
## 19089 1 1106.6848
## 19090 1 1020.0667
## 31419 1 1013.3993
## 34048 0 1013.2159
## 27218 0 1012.9310
## 9314 1 1002.7096
## 34567 0 1001.1161
## 27515 0 996.6254
## 32214 1 991.5865
## 34568 0 990.9957
## 24426 1 982.4784
## 24818 0 980.8403
## 33308 1 976.0093
## 25776 0 961.7268
## 29107 0 961.6910
## 10150 1 953.1522
## 33082 0 946.0039
## 31784 0 943.9847
remove(data.s)
It appears that sub-area nr. 51 (101-Skuggahverfi-Sjávarsíða) comes up very frequently.
\(~\)
data %>% filter(undirmatssvaedi == "51") %>%
summarize(Count=n(),
Meðalfermetraverð = round(mean(Price_m2),1),
Staðalfrávik = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(Meðalfermetraverð))
## Count Meðalfermetraverð Staðalfrávik Percent
## 1 79 612.6 183.9 100
So the median price/m2 for properties in this sub-area is very high, or 613 compared to 345 for the whole capital area.
\(~\)
data %>% filter(undirmatssvaedi != "0") %>%
ggplot(aes(x=undirmatssvaedi, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="Area", y="Price per m^2 (thousand ISK)") + ylim(0,1000) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
\(~\)
data %>% filter(undirmatssvaedi != "0") %>%
group_by(undirmatssvaedi) %>%
summarize(Count=n(),
Meðalfermetraverð = round(mean(Price_m2),1),
Staðalfrávik = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(Meðalfermetraverð))
## # A tibble: 67 x 5
## undirmatssvaedi Count Meðalfermetraverð Staðalfrávik Percent
## <fct> <int> <dbl> <dbl> <dbl>
## 1 51 79 613. 184. 1
## 2 11 5 564. 167. 0
## 3 57 14 547. 78.1 0
## 4 45 59 546. 106. 1
## 5 10 6 519. 111. 0
## 6 9 6 519. 136. 0
## 7 106 39 515 82.4 1
## 8 52 103 514. 117. 2
## 9 32 7 506. 194 0
## 10 59 101 505. 103. 2
## # … with 57 more rows
The most expensive properties are in sub-areas:
\(~\)
data %>% group_by(teg_eign) %>%
summarize(Count=n(),
MedianPriceM2 = round(median(Price_m2)),
MeanPriceM2 = round(mean(Price_m2),1),
StdDev = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(MeanPriceM2)) %>% kable() %>%
kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:6, width = "10em")
| teg_eign | Count | MedianPriceM2 | MeanPriceM2 | StdDev | Percent |
|---|---|---|---|---|---|
| apartment building | 66 | 397 | 399.3 | 121.7 | 0 |
| illegal apartment | 66 | 356 | 389.8 | 154.8 | 0 |
| apartment | 28765 | 349 | 365.9 | 104.8 | 82 |
| single-family property | 3176 | 333 | 347.2 | 102.1 | 9 |
| two-family building | 780 | 327 | 341.1 | 88.8 | 2 |
| town-house | 2253 | 319 | 329.8 | 88.4 | 6 |
data %>%
ggplot(aes(x=teg_eign, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="Type of property", y="Price per m^2 (thousand ISK)") + ylim(0,1200) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
data %>% filter(teg_eign == "apartment building") %>%
group_by(matssvaedi_samein) %>%
summarize(Count=n(),
MedianPriceM2 = round(median(Price_m2)),
MeanPriceM2 = round(mean(Price_m2),1),
StdDev = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(MeanPriceM2)) %>% kable() %>%
kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:6, width = "10em")
| matssvaedi_samein | Count | MedianPriceM2 | MeanPriceM2 | StdDev | Percent |
|---|---|---|---|---|---|
| MiðbærRvk | 39 | 450 | 441.9 | 128.5 | 59 |
| GarðabærSuður | 9 | 439 | 433.9 | 24.9 | 14 |
| VesturbærRvk | 9 | 288 | 305.0 | 56.0 | 14 |
| Hafnarfjörður | 7 | 270 | 278.3 | 23.8 | 11 |
| AusturRvk | 1 | 265 | 265.2 | NA | 2 |
| AustAusturRvk | 1 | 259 | 259.2 | NA | 2 |
data %>% filter(teg_eign == "illegal apartment") %>%
group_by(matssvaedi_samein) %>%
summarize(Count=n(),
MiðgildiFmVerðs = round(median(Price_m2)),
Meðalfermetraverð = round(mean(Price_m2),1),
Staðalfrávik = round(sd(Price_m2),1),) %>%
mutate(Percent = round((Count/sum(Count)*100))) %>%
arrange(desc(Meðalfermetraverð)) %>% kable()
| matssvaedi_samein | Count | MiðgildiFmVerðs | Meðalfermetraverð | Staðalfrávik | Percent |
|---|---|---|---|---|---|
| VesturbærRvk | 11 | 483 | 475.5 | 190.5 | 17 |
| MiðbærRvk | 25 | 382 | 400.4 | 138.5 | 38 |
| Árbær | 1 | 393 | 392.8 | NA | 2 |
| AusturRvk | 15 | 341 | 369.3 | 161.0 | 23 |
| AustAusturRvk | 7 | 346 | 359.1 | 153.1 | 11 |
| Breiðholt | 4 | 310 | 331.6 | 128.0 | 6 |
| Hafnarfjörður | 2 | 261 | 261.1 | 1.4 | 3 |
| GarðabærNorður | 1 | 192 | 191.9 | NA | 2 |
ggplot(data, aes(x=stig10, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="stig10", y="Price per m^2 (thousand ISK)") + ylim(0,1000) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
Apparently stig10 4-7 are more expensive. Reduce stig10 to two variables: >=7 and >7.
data$stig10_sam <- as.factor(data$stig10)
data$stig10_sam <- recode(data$stig10_sam,
"c('4','5.5','7','7.1')='low';
c('7.6','7.9','8','8.3','8.4','8.5','8.6','8.8','8.9','9','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','10')='high'")
levels(data$stig10_sam)
## [1] "high" "low"
ggplot(data, aes(x=stig10_sam, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="stig10", y="Price per m^2 (thousand ISK)") + ylim(0,1200) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
ggplot(data, aes(x=lyfta, y=Price_m2)) + geom_boxplot() +
theme_classic() + labs(x="Fjöldi lyfta", y="Price per m^2 (thousand ISK)") + ylim(0,1200) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)
data1<- data.frame(data)
data1[,c('nuvirdi','nuvirdi_m','rfastnum','Price_m2','matssvaedi','matssvaedi_samein')] <- list(NULL)
data_n <- mutate_if(data1, is.factor, ~ as.numeric(levels(.x))[.x]) #factor to numeric
data_n <- mutate_if(data_n, is.character, ~ as.numeric(levels(.x))[.x]) #character to numeric
cor<-cor(data_n,method="spearman")
round(cor,2)
## kdagur teg_eign svfn byggar efstah fjmib lyfta ibm2
## kdagur 1 NA NA NA NA NA NA NA
## teg_eign NA 1 NA NA NA NA NA NA
## svfn NA NA 1 NA NA NA NA NA
## byggar NA NA NA 1.00 0.25 -0.07 0.53 0.25
## efstah NA NA NA 0.25 1.00 -0.08 0.64 -0.32
## fjmib NA NA NA -0.07 -0.08 1.00 -0.04 0.09
## lyfta NA NA NA 0.53 0.64 -0.04 1.00 -0.05
## ibm2 NA NA NA 0.25 -0.32 0.09 -0.05 1.00
## fjhaed NA NA NA -0.11 -0.34 0.07 -0.24 0.50
## fjbilsk NA NA NA 0.51 0.40 -0.03 0.60 0.08
## fjbkar NA NA NA -0.08 -0.12 0.05 -0.18 0.27
## fjsturt NA NA NA 0.24 -0.12 0.05 0.11 0.27
## fjklos NA NA NA 0.06 -0.34 0.14 -0.13 0.59
## fjeld NA NA NA -0.03 -0.07 0.35 -0.03 0.12
## fjherb NA NA NA 0.06 -0.35 0.10 -0.19 0.81
## fjstof NA NA NA -0.22 -0.30 0.14 -0.21 0.42
## fjgeym NA NA NA 0.10 0.07 0.00 0.00 0.01
## stig10 NA NA NA -0.13 0.02 0.00 -0.04 -0.07
## bilskurm2 NA NA NA -0.03 -0.51 0.07 -0.29 0.57
## svalm2 NA NA NA 0.22 0.34 -0.02 0.22 0.01
## geymm2 NA NA NA 0.26 0.41 -0.06 0.36 -0.17
## undirmatssvaedi NA NA NA 0.06 0.23 -0.01 0.19 -0.01
## ibteg NA NA NA 0.10 0.67 -0.10 0.32 -0.51
## byggar_cat NA NA NA NA NA NA NA NA
## undirmatssvaedi_cat NA NA NA NA NA NA NA NA
## stig10_sam NA NA NA NA NA NA NA NA
## fjhaed fjbilsk fjbkar fjsturt fjklos fjeld fjherb
## kdagur NA NA NA NA NA NA NA
## teg_eign NA NA NA NA NA NA NA
## svfn NA NA NA NA NA NA NA
## byggar -0.11 0.51 -0.08 0.24 0.06 -0.03 0.06
## efstah -0.34 0.40 -0.12 -0.12 -0.34 -0.07 -0.35
## fjmib 0.07 -0.03 0.05 0.05 0.14 0.35 0.10
## lyfta -0.24 0.60 -0.18 0.11 -0.13 -0.03 -0.19
## ibm2 0.50 0.08 0.27 0.27 0.59 0.12 0.81
## fjhaed 1.00 -0.16 0.16 0.18 0.59 0.14 0.50
## fjbilsk -0.16 1.00 -0.11 0.10 -0.06 -0.03 -0.06
## fjbkar 0.16 -0.11 1.00 -0.45 0.18 0.08 0.32
## fjsturt 0.18 0.10 -0.45 1.00 0.34 0.09 0.15
## fjklos 0.59 -0.06 0.18 0.34 1.00 0.20 0.51
## fjeld 0.14 -0.03 0.08 0.09 0.20 1.00 0.16
## fjherb 0.50 -0.06 0.32 0.15 0.51 0.16 1.00
## fjstof 0.39 -0.18 0.16 0.12 0.40 0.24 0.35
## fjgeym -0.03 0.01 -0.08 0.06 0.02 0.06 -0.02
## stig10 -0.03 -0.03 0.05 -0.07 -0.06 0.00 -0.04
## bilskurm2 0.43 -0.28 0.20 0.20 0.50 0.09 0.51
## svalm2 -0.03 0.17 0.07 -0.08 -0.05 0.01 0.01
## geymm2 -0.27 0.33 -0.06 -0.06 -0.22 -0.02 -0.17
## undirmatssvaedi -0.06 0.21 -0.10 0.05 0.00 -0.02 -0.07
## ibteg -0.56 0.25 -0.12 -0.24 -0.55 -0.10 -0.48
## byggar_cat NA NA NA NA NA NA NA
## undirmatssvaedi_cat NA NA NA NA NA NA NA
## stig10_sam NA NA NA NA NA NA NA
## fjstof fjgeym stig10 bilskurm2 svalm2 geymm2
## kdagur NA NA NA NA NA NA
## teg_eign NA NA NA NA NA NA
## svfn NA NA NA NA NA NA
## byggar -0.22 0.10 -0.13 -0.03 0.22 0.26
## efstah -0.30 0.07 0.02 -0.51 0.34 0.41
## fjmib 0.14 0.00 0.00 0.07 -0.02 -0.06
## lyfta -0.21 0.00 -0.04 -0.29 0.22 0.36
## ibm2 0.42 0.01 -0.07 0.57 0.01 -0.17
## fjhaed 0.39 -0.03 -0.03 0.43 -0.03 -0.27
## fjbilsk -0.18 0.01 -0.03 -0.28 0.17 0.33
## fjbkar 0.16 -0.08 0.05 0.20 0.07 -0.06
## fjsturt 0.12 0.06 -0.07 0.20 -0.08 -0.06
## fjklos 0.40 0.02 -0.06 0.50 -0.05 -0.22
## fjeld 0.24 0.06 0.00 0.09 0.01 -0.02
## fjherb 0.35 -0.02 -0.04 0.51 0.01 -0.17
## fjstof 1.00 0.00 0.00 0.40 -0.06 -0.21
## fjgeym 0.00 1.00 -0.06 -0.06 0.05 0.03
## stig10 0.00 -0.06 1.00 -0.07 -0.02 -0.03
## bilskurm2 0.40 -0.06 -0.07 1.00 -0.11 -0.30
## svalm2 -0.06 0.05 -0.02 -0.11 1.00 0.23
## geymm2 -0.21 0.03 -0.03 -0.30 0.23 1.00
## undirmatssvaedi -0.04 0.00 0.02 -0.11 0.06 0.13
## ibteg -0.40 0.07 0.06 -0.62 0.28 0.40
## byggar_cat NA NA NA NA NA NA
## undirmatssvaedi_cat NA NA NA NA NA NA
## stig10_sam NA NA NA NA NA NA
## undirmatssvaedi ibteg byggar_cat undirmatssvaedi_cat
## kdagur NA NA NA NA
## teg_eign NA NA NA NA
## svfn NA NA NA NA
## byggar 0.06 0.10 NA NA
## efstah 0.23 0.67 NA NA
## fjmib -0.01 -0.10 NA NA
## lyfta 0.19 0.32 NA NA
## ibm2 -0.01 -0.51 NA NA
## fjhaed -0.06 -0.56 NA NA
## fjbilsk 0.21 0.25 NA NA
## fjbkar -0.10 -0.12 NA NA
## fjsturt 0.05 -0.24 NA NA
## fjklos 0.00 -0.55 NA NA
## fjeld -0.02 -0.10 NA NA
## fjherb -0.07 -0.48 NA NA
## fjstof -0.04 -0.40 NA NA
## fjgeym 0.00 0.07 NA NA
## stig10 0.02 0.06 NA NA
## bilskurm2 -0.11 -0.62 NA NA
## svalm2 0.06 0.28 NA NA
## geymm2 0.13 0.40 NA NA
## undirmatssvaedi 1.00 0.13 NA NA
## ibteg 0.13 1.00 NA NA
## byggar_cat NA NA 1 NA
## undirmatssvaedi_cat NA NA NA 1
## stig10_sam NA NA NA NA
## stig10_sam
## kdagur NA
## teg_eign NA
## svfn NA
## byggar NA
## efstah NA
## fjmib NA
## lyfta NA
## ibm2 NA
## fjhaed NA
## fjbilsk NA
## fjbkar NA
## fjsturt NA
## fjklos NA
## fjeld NA
## fjherb NA
## fjstof NA
## fjgeym NA
## stig10 NA
## bilskurm2 NA
## svalm2 NA
## geymm2 NA
## undirmatssvaedi NA
## ibteg NA
## byggar_cat NA
## undirmatssvaedi_cat NA
## stig10_sam 1
corrplot(cor, method="circle")
So, order by the coefficients,
=> ibm2(property area) is correlated with fjherb(no of rooms), fjklos(no of toilets), bilskurm2(garage area), ibteg(type of property), fjhaed(no of floors in property),
=> lyfta(whether there is elavator or not) is correlated with fjbilsk(no of garage), efstah(wheter the room is on top floor)
==> We decide to remove fjherb, fjklos, bilskurm2, ibteg, fjhaed, fjbilsk, efstah.
\(~\)
We remove the following variables that have clear correlations with other variables: - fjherb, fjklos, bilskurm2, ibteg, fjhaed, fjbilsk, efstah - rfastnum (not necessary for predictions)
data.small <- subset(data, select = c(kdagur, nuvirdi, teg_eign, svfn, byggar, fjmib, lyfta, ibm2, fjbkar, fjsturt, fjeld, fjstof, fjgeym, svalm2, geymm2, undirmatssvaedi_cat, matssvaedi_samein, byggar_cat))
data_n[,c('fjherb','fjklos','bilskurm2','ibteg','fjhaed','fjbilsk','efstah')] <- list(NULL) #Make a smaller dataset for PCA
data_n <- dummy.data.frame(data_n, names=c("kdagur", "teg_eign", "svfn", "byggar_cat", "stig10_sam","undirmatssvaedi_cat"))
pr.out = prcomp(scale(data_n), scale=TRUE)
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.495 1.3704 1.1732 1.07568 1.02852 0.97827 0.93961
## Proportion of Variance 0.172 0.1444 0.1059 0.08901 0.08137 0.07362 0.06791
## Cumulative Proportion 0.172 0.3165 0.4224 0.51137 0.59274 0.66635 0.73427
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.90335 0.85862 0.80242 0.7631 0.62816 0.52956
## Proportion of Variance 0.06277 0.05671 0.04953 0.0448 0.03035 0.02157
## Cumulative Proportion 0.79704 0.85375 0.90328 0.9481 0.97843 1.00000
pve = 100*pr.out$sdev^2/sum(pr.out$sdev^2)
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")
We can see that the first 5 PC make up 59.3% of the total variance explained. However, when we look at the graph, we see that while each of the first four principal components explain a substantial amount of variance, there is a marked decrease in the variance explained by further principal components. This suggests that there may be little benefit to examining more than four or so principal components.
#Loadings of PC
rotation <- pr.out$rotation
rotation[,1:5]
## PC1 PC2 PC3 PC4
## byggar 0.296675974 -0.41955135 0.209302477 0.281332856
## fjmib -0.257443186 -0.16794599 0.179059154 -0.550035971
## lyfta 0.376085160 -0.34399854 0.121579700 -0.028131624
## ibm2 -0.371259765 -0.40443305 0.107081114 0.427272074
## fjbkar -0.236194611 0.17175819 0.608638677 0.308102210
## fjsturt -0.080964831 -0.50551013 -0.427932717 0.053958048
## fjeld -0.279527000 -0.23922999 0.219641891 -0.496196023
## fjstof -0.465820449 -0.21910120 0.080883517 0.072169317
## fjgeym -0.008037599 -0.15415402 -0.038724965 -0.195433997
## stig10 -0.070018682 0.13483643 0.067734788 0.038648519
## svalm2 0.185924515 -0.07307232 0.439203430 -0.022871827
## geymm2 0.359134396 -0.09767168 0.300867382 -0.213078960
## undirmatssvaedi 0.201128367 -0.25839578 0.004657604 -0.009390208
## PC5
## byggar -0.073625879
## fjmib 0.157588399
## lyfta 0.240778902
## ibm2 -0.019960888
## fjbkar 0.004938184
## fjsturt -0.024584142
## fjeld 0.047385796
## fjstof 0.026990280
## fjgeym -0.571685683
## stig10 0.563602827
## svalm2 -0.264692312
## geymm2 -0.040416003
## undirmatssvaedi 0.437722989
Looking at the loadings of the first 5 PC (loadings <-0.4 or >0.4), we can see that:
PC1: (-) fjstof contributes negatively
PC2: (-) byggar, fjsturt, ibm2 contribute negatively
PC3: (+) fjbkar, svalm2 contribute positively; (-) fjsturt contributes negatively
PC4: (+) ibm2 contributes positively; (-) fjmib, fjeld contribute negatively
PC5: (+) stig10, undirmatssvaedi contribute positively; (-) fjgeym contributes negatively
\(~\)
#Create a training and test set
set.seed(3)
ind <- sample(nrow(data.small),nrow(data.small)/2)
train <- data.small[ind,]
test <- data.small[-ind,]
lm <- lm(nuvirdi ~ svfn + byggar + fjmib + lyfta + ibm2 + fjbkar + fjsturt + fjeld + fjstof+ fjgeym + svalm2 + geymm2, data=data.small)
tidy(lm)
## # A tibble: 23 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -10236. 6135. -1.67 9.52e- 2
## 2 svfnKópavogur -206. 166. -1.24 2.15e- 1
## 3 svfnSeltjarnarnes 8847. 425. 20.8 1.41e- 95
## 4 svfnGarðabær 4279. 239. 17.9 2.89e- 71
## 5 svfnHafnarfjörður -3896. 175. -22.2 8.86e-109
## 6 svfnMosfellsbær -1482. 327. -4.53 5.94e- 6
## 7 byggar 4.36 3.09 1.41 1.58e- 1
## 8 fjmib 9474. 742. 12.8 2.91e- 37
## 9 lyfta1 1015. 183. 5.55 2.84e- 8
## 10 lyfta2 2322. 250. 9.29 1.70e- 20
## # … with 13 more rows
plot_coeffs <- function(lm) {
coeffs <- coefficients(lm)
mp <- barplot(coeffs, col="#3F97D0", xaxt='n',
main="Regression Coefficients")
lablist <- names(coeffs)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
plot_pval <- function(lm) {
p_values <- coef(summary(lm))[,4]
mp <- barplot(p_values, col="#3F97D0", xaxt='n',
main="P-values")
lablist <- names(p_values)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
par(mfrow =c(2,1))
plot_coeffs(lm)
plot_pval(lm)
Effects of variables on price (nuvirdi) in model:
lm2 <- lm(nuvirdi ~ svfn + byggar + fjmib + lyfta + ibm2 + fjbkar + fjsturt + fjeld + fjstof+ fjgeym + svalm2 + geymm2, data=train)
#Calculate test MSE
lm.pred = predict(lm2,test)
mean((lm.pred-test$nuvirdi)^2)
## [1] 112588388
\(~\)
#Exclude some variables that include svaedi and/or have many categories
glm.fit <- glm(nuvirdi ~. - kdagur - matssvaedi_samein -byggar_cat, data=data.small)
tidy(glm.fit)
## # A tibble: 29 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4927. 6058. -0.813 4.16e- 1
## 2 teg_eignapartment building -1228. 1305. -0.941 3.47e- 1
## 3 teg_eignapartment -6572. 266. -24.7 1.69e-133
## 4 teg_eignillegal apartment -10896. 1302. -8.37 5.88e- 17
## 5 teg_eigntwo-family building -2930. 417. -7.03 2.05e- 12
## 6 teg_eigntown-house -5138. 291. -17.7 1.71e- 69
## 7 svfnKópavogur -321. 163. -1.98 4.82e- 2
## 8 svfnSeltjarnarnes 7820. 417. 18.8 3.78e- 78
## 9 svfnGarðabær 3800. 235. 16.1 2.12e- 58
## 10 svfnHafnarfjörður -4917. 173. -28.4 6.14e-175
## # … with 19 more rows
plot_coeffs <- function(glm.fit) {
coeffs <- coefficients(glm.fit)
mp <- barplot(coeffs, col="#3F97D0", xaxt='n',
main="Regression Coefficients")
lablist <- names(coeffs)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
plot_pval <- function(glm.fit) {
p_values <- coef(summary(glm.fit))[,4]
mp <- barplot(p_values, col="#3F97D0", xaxt='n',
main="P-values")
lablist <- names(p_values)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
par(mfrow =c(2,1))
plot_coeffs(glm.fit)
plot_pval(glm.fit)
Effects of variable on price:
Summary:
#General linear model to predict price based on area and byggar categories
glm.fit2 <- glm(nuvirdi ~ matssvaedi_samein, data=data.small)
tidy(glm.fit2)
## # A tibble: 15 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 41102. 930. 44.2 0.
## 2 matssvaedi_sameinÁrbær -7630. 1014. -7.53 5.31e-14
## 3 matssvaedi_sameinAustAusturRvk -1769. 1029. -1.72 8.57e- 2
## 4 matssvaedi_sameinAusturRvk -7252. 955. -7.59 3.27e-14
## 5 matssvaedi_sameinBreiðholt -12398. 976. -12.7 6.41e-37
## 6 matssvaedi_sameinGarðabærNorður 8550. 1006. 8.50 2.01e-17
## 7 matssvaedi_sameinGarðabærSuður 11669. 1322. 8.83 1.13e-18
## 8 matssvaedi_sameinGrafarvogur/Graf… -5113. 971. -5.26 1.41e- 7
## 9 matssvaedi_sameinHafnarfjörður -6562. 960. -6.83 8.34e-12
## 10 matssvaedi_sameinKópavogur -2301. 954. -2.41 1.58e- 2
## 11 matssvaedi_sameinMiðbærRvk -3560. 992. -3.59 3.33e- 4
## 12 matssvaedi_sameinMosfellsbærHelga… -535. 1538. -0.348 7.28e- 1
## 13 matssvaedi_sameinMosfellsbærVest/… -871. 1062. -0.820 4.12e- 1
## 14 matssvaedi_sameinSeltjarnarnes 10687. 1136. 9.41 5.35e-21
## 15 matssvaedi_sameinVesturbærRvk -3412. 994. -3.43 5.96e- 4
plot_coeffs <- function(glm.fit2) {
coeffs <- coefficients(glm.fit2)
mp <- barplot(coeffs, col="#3F97D0", xaxt='n',
main="Regression Coefficients")
lablist <- names(coeffs)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
plot_pval <- function(glm.fit2) {
p_values <- coef(summary(glm.fit2))[,4]
mp <- barplot(p_values, col="#3F97D0", xaxt='n',
main="P-values")
lablist <- names(p_values)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
par(mfrow =c(2,1))
plot_coeffs(glm.fit2)
plot_pval(glm.fit2)
Areas comapared to AustAusturRvk
#Exclude some variables that include svaedi and/or have many categories
glm.fit4 <- glm(nuvirdi ~. - kdagur - matssvaedi_samein -byggar_cat, data=train)
#Calculate test MSE
glm.fit4$xlevels[["byggar_cat"]] <- union(glm.fit4$xlevels[["byggar_cat"]], levels(test$byggar_cat))
glm.pred = predict(glm.fit4,newdata =test)
mean((glm.pred-test$nuvirdi)^2)
## [1] 107962700
\(~\)
set.seed(2)
#Creata an x matrix for the model fit and a lambda grid
x <- model.matrix(nuvirdi ~ . -nuvirdi -lyfta -matssvaedi_samein -byggar_cat, data.small)
y <- data.small$nuvirdi
lambda <- 10^seq(10, -2, length = 100)
#Split the data into training and validation sets
train.l = sample(nrow(x), nrow(x)/2)
test.l = (-train.l)
ytest = y[test.l]
#Perform cross-validation to find the optimal lambda value for lasso testing and plot lambda vs. MSE
cv.out <- cv.glmnet(x[train.l,], y[train.l], alpha = 1)
bestlam <- cv.out$lambda.min
bestlam
## [1] 13.45456
plot(cv.out)
#Fit a lasso model using the training data set and predict the best model using the test set.
lasso.mod <- glmnet(x[train.l,], y[train.l], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test.l,])
#Calculage the MSE
mean((lasso.pred-ytest)^2)
## [1] 67250135
#Find the coefficients for the best model defined by lasso.
lasso.coef <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef %>% tidy()
## row column value
## 1 (Intercept) 1 -2.073986e+04
## 2 kdagur2013 1 1.837146e+03
## 3 kdagur2014 1 4.445607e+03
## 4 kdagur2015 1 7.139038e+03
## 5 kdagur2016 1 1.147419e+04
## 6 kdagur2017 1 1.875075e+04
## 7 kdagur2018 1 1.957934e+04
## 8 teg_eignapartment building 1 5.408863e+02
## 9 teg_eignapartment 1 -6.365816e+03
## 10 teg_eignillegal apartment 1 -1.059803e+04
## 11 teg_eigntwo-family building 1 -3.390557e+03
## 12 teg_eigntown-house 1 -5.097201e+03
## 13 svfnKópavogur 1 -1.576999e+02
## 14 svfnSeltjarnarnes 1 8.214351e+03
## 15 svfnGarðabær 1 3.194727e+03
## 16 svfnHafnarfjörður 1 -5.057519e+03
## 17 svfnMosfellsbær 1 -2.792108e+03
## 18 byggar 1 1.125526e+01
## 19 fjmib 1 6.896599e+03
## 20 ibm2 1 2.433579e+02
## 21 fjbkar 1 -7.598296e+02
## 22 fjsturt 1 2.023552e+03
## 23 fjeld 1 -2.691056e+03
## 24 fjstof 1 1.050546e+03
## 25 fjgeym 1 4.654945e+02
## 26 svalm2 1 -2.053533e-01
## 27 geymm2 1 2.565614e+00
## 28 undirmatssvaedi_cat1 1 1.206995e+04
coef(lasso.mod, s = bestlam) %>%
broom::tidy() %>%
filter(row != "(Intercept)") %>%
top_n(20, wt = abs(value)) %>%
ggplot(aes(value, reorder(row, value), color = value > 0)) +
geom_point(show.legend = FALSE) +
ggtitle("Top 20 influential variables (lasso penalty)") +
xlab("Coefficient") +
ylab(NULL) + theme_bw()
\(~\)
regfit.fwd = regsubsets (nuvirdi ~., data = train, method = "forward", nvmax = 19)
## Reordering variables and trying again:
reg.summary.fwd <- summary(regfit.fwd)
reg.summary.fwd$adjr2
## [1] 0.5926329 0.6722047 0.6984602 0.7194390 0.7377669 0.7508385 0.7661221
## [8] 0.7777080 0.7850158 0.7897722 0.7939195 0.7982194 0.8030102 0.8068853
## [15] 0.8096696 0.8123267 0.8147420 0.8176396 0.8201330 0.8226176
which.max(reg.summary.fwd$adjr2)
## [1] 20
plot(reg.summary.fwd$adjr2, xlab = "Number of Variables", ylab="Adj R^2", type = "b", col="blue")
Forward selection: Using all variables gives the best prediction.
#Test set
regfit.fwd2 = regsubsets (nuvirdi ~., data = test, method = "forward", nvmax = 19)
## Reordering variables and trying again:
reg.summary.fwd2 <- summary(regfit.fwd2)
reg.summary.fwd2$adjr2
## [1] 0.5878741 0.6686811 0.6946132 0.7114390 0.7284384 0.7407086 0.7557391
## [8] 0.7658533 0.7724712 0.7774559 0.7827701 0.7878955 0.7924211 0.7961447
## [15] 0.7994310 0.8027678 0.8056707 0.8084389 0.8112266 0.8134964
which.max(reg.summary.fwd2$adjr2)
## [1] 20
plot(reg.summary.fwd2$adjr2, xlab = "Number of Variables", ylab="Adj R^2", type = "b", col="blue")
test.mat=model.matrix (nuvirdi~.,data=test)
val.errors =rep(NA ,19)
for(i in 1:19){
coefi = coef(regfit.fwd2,id=i)
pred=test.mat[,names(coefi)]%*% coefi
val.errors[i]= mean((test$nuvirdi-pred)^2) }
val.errors
## [1] 125343020 100760810 92869049 87747279 82573277 78837825 74263553
## [8] 71184433 70687517 70687517 70687517 69259069 69200241 67967624
## [15] 67901073 67244432 66276395 66275311 66054048
which.min (val.errors )
## [1] 19
\(~\)
tree <- tree(formula = nuvirdi ~. - byggar_cat, data = data.small)
tree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 35106 1.053e+13 36860
## 2) ibm2 < 122.85 26510 2.706e+12 30510
## 4) kdagur < 2015.5 16101 1.046e+12 26490
## 8) ibm2 < 92.75 9973 3.667e+11 22990 *
## 9) ibm2 > 92.75 6128 3.597e+11 32170 *
## 5) kdagur > 2015.5 10409 9.975e+11 36730
## 10) ibm2 < 89.75 5969 2.967e+11 32130 *
## 11) ibm2 > 89.75 4440 4.046e+11 42910 *
## 3) ibm2 > 122.85 8596 3.453e+12 56460
## 6) ibm2 < 197.55 6861 1.632e+12 52190
## 12) kdagur < 2015.5 4089 6.881e+11 46300
## 24) matssvaedi_samein: Álftanes,Árbær,AustAusturRvk,AusturRvk,Breiðholt,GarðabærNorður,Grafarvogur/Grafarholt,Hafnarfjörður,Kópavogur,MosfellsbærHelgafell,MosfellsbærVest/Kjalarnes 3577 3.803e+11 44330 *
## 25) matssvaedi_samein: GarðabærSuður,MiðbærRvk,Seltjarnarnes,VesturbærRvk 512 1.964e+11 60100 *
## 13) kdagur > 2015.5 2772 5.932e+11 60870 *
## 7) ibm2 > 197.55 1735 1.201e+12 73350
## 14) matssvaedi_samein: Álftanes,Árbær,AustAusturRvk,Breiðholt,Grafarvogur/Grafarholt,Hafnarfjörður,Kópavogur,MosfellsbærHelgafell,MosfellsbærVest/Kjalarnes 1169 3.845e+11 66010 *
## 15) matssvaedi_samein: AusturRvk,GarðabærNorður,GarðabærSuður,MiðbærRvk,Seltjarnarnes,VesturbærRvk 566 6.232e+11 88520
## 30) ibm2 < 327.15 528 3.647e+11 84270 *
## 31) ibm2 > 327.15 38 1.164e+11 147600 *
summary(tree)
##
## Regression tree:
## tree(formula = nuvirdi ~ . - byggar_cat, data = data.small)
## Variables actually used in tree construction:
## [1] "ibm2" "kdagur" "matssvaedi_samein"
## Number of terminal nodes: 10
## Residual mean deviance: 98680000 = 3.463e+12 / 35100
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -97650.0 -5637.0 -693.7 0.0 4518.0 196300.0
plot(tree)
text(tree)
tree.pred = predict(tree, newdata=test)
mean((tree.pred-test$nuvirdi)^2)
## [1] 101173719
matssvaedi_samein:
AustAusturRvk, Álftanes, Árbær, Breiðholt, Grafarvogur/Grafarholt, Hafnarfjörður, Kópavogur, MosfellsbærHelgafell, MosfellsbærVest/Kjalarnes
vs.
\(~\)
# Create a decision tree model
data.small$price_cat <- ifelse(data.small$nuvirdi>median(data.small$nuvirdi), "1", "0")
data.small <- mutate(data.small, price_cat=fct_recode(price_cat, Low="0", High="1"))
data.small$price_cat <- ifelse(data.small$nuvirdi > median(data.small$nuvirdi), "1", "0")
data.small <- mutate(data.small, price_cat = fct_recode(price_cat, Low ="0", High="1"))
tree2 <- rpart(price_cat ~ .-nuvirdi, data = data.small, cp=.02)
summary(tree2)
## Call:
## rpart(formula = price_cat ~ . - nuvirdi, data = data.small, cp = 0.02)
## n= 35106
##
## CP nsplit rel error xerror xstd
## 1 0.56819736 0 1.0000000 1.0176618 0.005336626
## 2 0.07862352 1 0.4318026 0.4322584 0.004393737
## 3 0.03048086 2 0.3531791 0.3532931 0.004070990
## 4 0.02000000 3 0.3226983 0.3228122 0.003927262
##
## Variable importance
## ibm2 teg_eign kdagur fjstof
## 46 16 13 11
## matssvaedi_samein svfn fjsturt byggar
## 4 4 4 1
## byggar_cat
## 1
##
## Node number 1: 35106 observations, complexity param=0.5681974
## predicted class=Low expected loss=0.4999715 P(node) =1
## class counts: 17554 17552
## probabilities: 0.500 0.500
## left son=2 (20175 obs) right son=3 (14931 obs)
## Primary splits:
## ibm2 < 101.85 to the left, improve=5796.617, (0 missing)
## teg_eign splits as RRLLRR, improve=2900.233, (0 missing)
## kdagur splits as LLLLRRR, improve=1968.250, (0 missing)
## fjstof < 1.5 to the left, improve=1364.019, (0 missing)
## fjsturt < 0.5 to the left, improve=1103.771, (0 missing)
## Surrogate splits:
## teg_eign splits as RRLLRR, agree=0.732, adj=0.371, (0 split)
## fjstof < 1.5 to the left, agree=0.683, adj=0.254, (0 split)
## matssvaedi_samein splits as RLLLLRRLLRLLRRL, agree=0.616, adj=0.096, (0 split)
## svfn splits as LRRRLR, agree=0.615, adj=0.095, (0 split)
## fjsturt < 1.5 to the left, agree=0.614, adj=0.092, (0 split)
##
## Node number 2: 20175 observations, complexity param=0.07862352
## predicted class=Low expected loss=0.2527881 P(node) =0.5746881
## class counts: 15075 5100
## probabilities: 0.747 0.253
## left son=4 (16129 obs) right son=5 (4046 obs)
## Primary splits:
## kdagur splits as LLLLLRR, improve=1766.4300, (0 missing)
## ibm2 < 79.65 to the left, improve= 634.8837, (0 missing)
## byggar < 2013.5 to the left, improve= 544.7991, (0 missing)
## byggar_cat splits as R--L---LRL-LLLLLLLLLLLLLLLLLLLLLLLLR, improve= 426.0349, (0 missing)
## matssvaedi_samein splits as RLRRLRRRLRRRRRR, improve= 237.6448, (0 missing)
## Surrogate splits:
## byggar < 2016.5 to the left, agree=0.812, adj=0.061, (0 split)
## byggar_cat splits as L--L---LLL-LLLLLLLLLLLLLLLLLLLLLLLLR, agree=0.805, adj=0.025, (0 split)
## matssvaedi_samein splits as LLLLLLRLLLLRLLL, agree=0.803, adj=0.017, (0 split)
##
## Node number 3: 14931 observations
## predicted class=High expected loss=0.1660304 P(node) =0.4253119
## class counts: 2479 12452
## probabilities: 0.166 0.834
##
## Node number 4: 16129 observations
## predicted class=Low expected loss=0.1479943 P(node) =0.4594371
## class counts: 13742 2387
## probabilities: 0.852 0.148
##
## Node number 5: 4046 observations, complexity param=0.03048086
## predicted class=High expected loss=0.3294612 P(node) =0.115251
## class counts: 1333 2713
## probabilities: 0.329 0.671
## left son=10 (1257 obs) right son=11 (2789 obs)
## Primary splits:
## ibm2 < 67.35 to the left, improve=535.95330, (0 missing)
## byggar_cat splits as R------LR--LRRLLLLRLLLRRRLLLLRRRRRRR, improve=180.51100, (0 missing)
## byggar < 1989.5 to the left, improve=175.87460, (0 missing)
## matssvaedi_samein splits as RLRRLRRRRRRRRRR, improve=129.97060, (0 missing)
## geymm2 < 590.5 to the left, improve= 61.00849, (0 missing)
## Surrogate splits:
## byggar_cat splits as R------LR--LRLRLLLRLLRRRRRRRRRRRRRRR, agree=0.716, adj=0.085, (0 split)
## byggar < 1943.5 to the left, agree=0.714, adj=0.080, (0 split)
## matssvaedi_samein splits as RRRRRRRRRRLRRRR, agree=0.693, adj=0.013, (0 split)
## teg_eign splits as RLRLRR, agree=0.693, adj=0.010, (0 split)
## fjstof < 0.5 to the left, agree=0.692, adj=0.007, (0 split)
##
## Node number 10: 1257 observations
## predicted class=Low expected loss=0.2871917 P(node) =0.03580585
## class counts: 896 361
## probabilities: 0.713 0.287
##
## Node number 11: 2789 observations
## predicted class=High expected loss=0.156687 P(node) =0.07944511
## class counts: 437 2352
## probabilities: 0.157 0.843
# Visualize the decision tree with rpart.plot
rpart.plot(tree2, box.palette="RdBu", shadow.col="gray", nn=TRUE)
{r, warning=F, message=F} set.seed (1) rf =randomForest(nuvirdi~.,data=train, mtry=4, importance =TRUE) rf.pred = predict(rf,newdata =test) mean((rf.pred-test$nuvirdi)^2) vip(rf, num_features = 25, bar = FALSE) + ggtitle(“Variable importance”) + theme_bw()
\(~\)
#tune.out=tune(svm ,nuvirdi~., data=train,kernel ="linear", ranges =list(cost=c(1,2, 5,10,100), gamma=c(2,3,4) ))
svm.fit =svm(nuvirdi~., data=train, kernel ="linear", cost =2)
svmpred = predict(svm.fit,test)
mean((svmpred-test$nuvirdi)^2)
## [1] 55267350
Among the methods above, we can see that the random forest gives the smallest test MSE. And that the most important variables among all methods are: ibm2, teg_eign, matssvaedi_samein, svfn, kdagur.
{r, warning=F, message=F} #Create new data set with the variables above data.small2 <- subset(data.small, select = c(kdagur, nuvirdi, teg_eign, svfn, ibm2, matssvaedi_samein, fjsturt, lyfta, byggar)) set.seed(3)
ind2 <- sample(nrow(data.small2),nrow(data.small2)/2) train2 <- data.small[ind2,] test2 <- data.small[-ind2,]
rf2 =randomForest(nuvirdi~.,data=train2, mtry=2, importance =TRUE) rf.pred2 = predict(rf2,newdata =test) mean((rf.pred2-test2$nuvirdi)^2)
\(~\)
Like mentioned earlier, the regression tree earlier split the areas (matssvaedi) into two groups by price:
Low price areas:
High price areas:
We start with creating a new binary variable that splits areas into these two categories.
data.small <- mutate(data.small, matssvaedi_cat = fct_recode(matssvaedi_samein, "0"="AustAusturRvk",
"0"="AusturRvk",
"0"="Álftanes",
"0"="Árbær",
"0"="Breiðholt",
"0"="Grafarvogur/Grafarholt",
"0"="Hafnarfjörður",
"0"="Kópavogur",
"0"="MosfellsbærHelgafell",
"0"="MosfellsbærVest/Kjalarnes",
"1"="GarðabærNorður",
"1"="GarðabærSuður",
"1"="MiðbærRvk",
"1"="Seltjarnarnes",
"1"="VesturbærRvk"))
summary(data.small$matssvaedi_cat)
## 0 1
## 27777 7329
data.small$matssvaedi_cat <- as.numeric(data.small$matssvaedi_cat)
# Create a new dataset without connected variables (like matssvaedi_samein)
data.cat <- subset(data.small, select = c(kdagur, nuvirdi, teg_eign, byggar, fjmib, lyfta, ibm2, fjbkar, fjsturt, fjeld, fjstof, fjgeym, svalm2, geymm2, matssvaedi_cat))
#Make train and test datasets
train.cat <- data.cat[ind,]
test.cat <- data.cat[-ind,]
There are 27.777 properties in the “Low price areas” and 7329 properties in the “High price areas”.
\(~\)
# Create a decision tree model
tree.cat <- rpart(matssvaedi_cat~., data = train.cat, cp=.02)
summary(tree.cat)
## Call:
## rpart(formula = matssvaedi_cat ~ ., data = train.cat, cp = 0.02)
## n= 17553
##
## CP nsplit rel error xerror xstd
## 1 0.12415848 0 1.0000000 1.0001523 0.01091040
## 2 0.02147231 1 0.8758415 0.8760588 0.01135014
## 3 0.02000000 2 0.8543692 0.8638745 0.01124887
##
## Variable importance
## byggar lyfta teg_eign
## 84 15 1
##
## Node number 1: 17553 observations, complexity param=0.1241585
## mean=1.207144, MSE=0.1642354
## left son=2 (16547 obs) right son=3 (1006 obs)
## Primary splits:
## byggar < 1939.5 to the right, improve=0.124158500, (0 missing)
## nuvirdi < 76603 to the left, improve=0.019412110, (0 missing)
## lyfta splits as LLLRRRLL, improve=0.016717930, (0 missing)
## fjbkar < 0.5 to the right, improve=0.013143810, (0 missing)
## fjsturt < 0.5 to the left, improve=0.008275642, (0 missing)
## Surrogate splits:
## teg_eign splits as LRLRLL, agree=0.943, adj=0.012, (0 split)
## ibm2 < 26.35 to the right, agree=0.943, adj=0.003, (0 split)
## nuvirdi < 186820 to the left, agree=0.943, adj=0.002, (0 split)
## geymm2 < 758.5 to the left, agree=0.943, adj=0.001, (0 split)
##
## Node number 2: 16547 observations, complexity param=0.02147231
## mean=1.171934, MSE=0.142373
## left son=4 (15895 obs) right son=5 (652 obs)
## Primary splits:
## lyfta splits as LLLRRRLL, improve=0.026275440, (0 missing)
## nuvirdi < 45230.5 to the left, improve=0.025888620, (0 missing)
## byggar < 2013.5 to the left, improve=0.012171320, (0 missing)
## fjsturt < 0.5 to the left, improve=0.008937531, (0 missing)
## fjbkar < 0.5 to the right, improve=0.007275092, (0 missing)
##
## Node number 3: 1006 observations
## mean=1.786282, MSE=0.1680424
##
## Node number 4: 15895 observations
## mean=1.159547, MSE=0.1340918
##
## Node number 5: 652 observations
## mean=1.473926, MSE=0.2493202
# Visualize the decision tree with rpart.plot
rpart.plot(tree.cat, box.palette="RdBu", shadow.col="gray", nn=TRUE)
# Test on test set
tree.pred = predict(tree.cat, test.cat)
(err.tree <- mean((tree.pred-test.cat$matssvaedi_cat)^2))
## [1] 0.1391585
Results:
The variable byggar seems to matter most, that is, in recent years, more properties have been built in the “Low price areas” (because they are newer - see subchapter “Byggar ~ location (combined)”). There also seem to be different number of elevators in properties depending on our variable (most likely also linked to age (byggar), newer properties are likely to have more elevators than older ones).
We obtain a test MSE of roughly 0.14.
\(~\)
{r, warning=F, message=F} rf.fit <- randomForest(matssvaedi_cat~., data=train.cat, mtry=2) rf.probs <- predict(rf.fit, newdata=test.cat) rf.pred <- ifelse(rf.probs>0.5, “1”, “0”) table(test.cat$matssvaedi_cat, rf.pred)
Don’t think we can use random forests (Warning message: The response has 5 or fewer unique values, are you sure you want to do regression?)
\(~\)
lda.fit <- lda(matssvaedi_cat~., data=train.cat)
lda.fit
## Call:
## lda(matssvaedi_cat ~ ., data = train.cat)
##
## Prior probabilities of groups:
## 1 2
## 0.7928559 0.2071441
##
## Group means:
## kdagur2013 kdagur2014 kdagur2015 kdagur2016 kdagur2017 kdagur2018
## 1 0.1414098 0.1512539 0.1832291 0.1983186 0.1796364 0.02134081
## 2 0.1361386 0.1501650 0.1853685 0.1900440 0.1771177 0.02557756
## nuvirdi teg_eignapartment building teg_eignapartment
## 1 35257.19 0.0001437091 0.8217288
## 2 42610.20 0.0068756876 0.8217822
## teg_eignillegal apartment teg_eigntwo-family building teg_eigntown-house
## 1 0.0007185457 0.02356830 0.06646547
## 2 0.0046754675 0.01320132 0.05308031
## byggar fjmib lyfta1 lyfta2 lyfta3 lyfta4
## 1 1983.237 1.003665 0.1524754 0.06639362 0.01803550 0.004526838
## 2 1971.181 1.010726 0.1193619 0.05775578 0.05143014 0.016501650
## lyfta5 lyfta6 lyfta8 ibm2 fjbkar fjsturt fjeld
## 1 0.002083782 0.006035784 0.00172451 105.3660 0.7965797 0.5338076 1.004886
## 2 0.017051705 0.000000000 0.00000000 103.3867 0.6782178 0.6641914 1.009626
## fjstof fjgeym svalm2 geymm2
## 1 1.226557 0.6109075 247.4746 272.3390
## 2 1.280253 0.5552805 215.1672 254.5388
##
## Coefficients of linear discriminants:
## LD1
## kdagur2013 -2.962908e-01
## kdagur2014 -5.469451e-01
## kdagur2015 -7.822006e-01
## kdagur2016 -1.208779e+00
## kdagur2017 -1.907400e+00
## kdagur2018 -1.798832e+00
## nuvirdi 9.870398e-05
## teg_eignapartment building 2.820009e+00
## teg_eignapartment 7.233430e-01
## teg_eignillegal apartment 2.681230e+00
## teg_eigntwo-family building 1.025169e-01
## teg_eigntown-house 4.525220e-01
## byggar -2.907267e-02
## fjmib -4.825949e-01
## lyfta1 3.220267e-01
## lyfta2 2.165095e-01
## lyfta3 1.314860e+00
## lyfta4 1.744920e+00
## lyfta5 2.655697e+00
## lyfta6 -1.251785e+00
## lyfta8 -6.024940e-01
## ibm2 -2.256004e-02
## fjbkar -2.405085e-01
## fjsturt 1.805437e-01
## fjeld 1.950035e-01
## fjstof -1.040952e-01
## fjgeym -1.408140e-01
## svalm2 -4.350672e-05
## geymm2 -2.614400e-04
lda.pred <- predict(lda.fit, test.cat)
table(data=lda.pred$class, prediction=test.cat$matssvaedi_cat)
## prediction
## data 1 2
## 1 13351 2429
## 2 509 1264
(err.LDA <- mean(lda.pred$class!=test.cat$matssvaedi_cat))
## [1] 0.1673788
Results:
LDA with the the same variables as in the decision tree gives a test MSE of 0.17.
\(~\)
#Creata a matrix for the model fit and a lambda grid
x <- model.matrix(matssvaedi_cat~., data.cat)
y <- data.cat$matssvaedi_cat
lambda <- 10^seq(10, -2, length = 100)
#Split the data into training and validation sets
set.seed(1)
train.l = sample(nrow(x), nrow(x)/2)
test.l = (-train.l)
ytest = y[test.l]
#Perform cross-validation to find the optimal lambda value for lasso testing and plot lambda vs. MSE
cv.out <- cv.glmnet(x[train.l,], y[train.l], alpha = 1)
(bestlam <- cv.out$lambda.min)
## [1] 0.0001131944
plot(cv.out)
#Fit a lasso model using the training data set and predict the best model using the test set.
lasso.mod <- glmnet(x[train.l,], y[train.l], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test.l,])
#Calculage the MSE
(err.lasso <- mean((lasso.pred-ytest)^2))
## [1] 0.1294638
#Find the coefficients for the best model defined by lasso.
lasso.coef <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef %>% tidy()
## row column value
## 1 (Intercept) 1 1.085610e+01
## 2 kdagur2013 1 -2.802122e-02
## 3 kdagur2014 1 -7.507919e-02
## 4 kdagur2015 1 -1.121351e-01
## 5 kdagur2016 1 -1.831740e-01
## 6 kdagur2017 1 -3.080610e-01
## 7 kdagur2018 1 -2.956879e-01
## 8 nuvirdi 1 1.660170e-05
## 9 teg_eignapartment building 1 5.161474e-01
## 10 teg_eignapartment 1 1.298891e-01
## 11 teg_eignillegal apartment 1 3.420785e-01
## 12 teg_eigntwo-family building 1 2.430116e-02
## 13 teg_eigntown-house 1 7.507783e-02
## 14 byggar 1 -4.908401e-03
## 15 fjmib 1 -9.812785e-02
## 16 lyfta1 1 5.042063e-02
## 17 lyfta2 1 3.858494e-02
## 18 lyfta3 1 2.436021e-01
## 19 lyfta4 1 3.242303e-01
## 20 lyfta5 1 4.245498e-01
## 21 lyfta6 1 -2.036992e-01
## 22 lyfta8 1 -9.996877e-02
## 23 ibm2 1 -3.805649e-03
## 24 fjbkar 1 -4.064817e-02
## 25 fjsturt 1 3.191027e-02
## 26 fjeld 1 1.646319e-02
## 27 fjstof 1 -1.504679e-02
## 28 fjgeym 1 -1.606324e-02
## 29 svalm2 1 -7.784295e-06
## 30 geymm2 1 -4.104561e-05
coef(lasso.mod, s = bestlam) %>%
broom::tidy() %>%
filter(row != "(Intercept)") %>%
top_n(20, wt = abs(value)) %>%
ggplot(aes(value, reorder(row, value), color = value > 0)) +
geom_point(show.legend = FALSE) +
ggtitle("Top 20 influential variables (lasso penalty)") +
xlab("Coefficient") +
ylab(NULL) + theme_bw()
Results:
Lasso selects a model with with lambda = 0.0001131944 where the test MSE = 0.13.
The Lasso is optimal when around 25 variables are included.
The most influential variables are:
\(~\)
glm.fit <- glm(matssvaedi_cat~., data=train.cat)
tidy(glm.fit)
## # A tibble: 30 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 10.8 0.285 37.9 1.27e-302
## 2 kdagur2013 -0.0497 0.0105 -4.74 2.20e- 6
## 3 kdagur2014 -0.0918 0.0104 -8.80 1.47e- 18
## 4 kdagur2015 -0.131 0.0102 -12.9 6.52e- 38
## 5 kdagur2016 -0.203 0.0105 -19.4 6.24e- 83
## 6 kdagur2017 -0.320 0.0117 -27.4 2.06e-162
## 7 kdagur2018 -0.302 0.0207 -14.6 4.51e- 48
## 8 nuvirdi 0.0000166 0.000000323 51.3 0.
## 9 teg_eignapartment building 0.473 0.0708 6.68 2.39e- 11
## 10 teg_eignapartment 0.121 0.0132 9.18 4.60e- 20
## # … with 20 more rows
plot_coeffs <- function(glm.fit) {
coeffs <- coefficients(glm.fit)
mp <- barplot(coeffs, col="#3F97D0", xaxt='n',
main="Regression Coefficients")
lablist <- names(coeffs)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
plot_pval <- function(glm.fit) {
p_values <- coef(summary(glm.fit))[,4]
mp <- barplot(p_values, col="#3F97D0", xaxt='n',
main="P-values")
lablist <- names(p_values)
text(mp, par("usr")[3], labels = lablist, srt = 45,
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}
plot_coeffs(glm.fit)
plot_pval(glm.fit)
glm.probs <- predict(glm.fit, test.cat, type="response")
glm.pred <- ifelse(glm.probs>0.5,"1","0")
table(glm.pred, test.cat$matssvaedi_cat)
##
## glm.pred 1 2
## 0 4 0
## 1 13856 3693
(err.glm <- mean(glm.pred!=test.cat$matssvaedi_cat))
## [1] 0.2106193
Results:
GLM with the the same variables as before gives a test MSE of 0.21.
\(~\)
#svm.linear <- svm(matssvaedi_cat~., data=train.cat, kernel="linear", cost=10)
Results:
RStudio keeps freezing when trying to take the svm further - skip for now
\(~\)
err.tree
## [1] 0.1391585
err.LDA
## [1] 0.1673788
err.lasso
## [1] 0.1294638
err.glm
## [1] 0.2106193
Results:
The Lasso and the Decision tree performed best on the dataset with test MSE of around 0.14.
\(~\)
Interpretation: ???????????????????